Celestin Apprentice 5

home *** CD-ROM | disk | FTP | other *** search

/ Celestin Apprentice 5 / Apprentice-Release5.iso / Source Code / Libraries / DCLAP 6d / dclap6d / SeqPups / appsrc / autoseq.src / CTraceFile.C < prev next >

Wrap

Text File | 1996-07-05 | 45KB | 1,406 lines

// ============================================================================ // CTraceFile.C 80 columns // Reece Hart (reece@ibc.wustl.edu) tab=4 spaces // Washington University School of Medicine, St. Louis, Missouri // This source is hereby released to the public domain. Bug reports, code // contributions, and suggestions are appreciated (to email address above). // - - - - - - - - - - - - - - - - // Please see CTraceFile.H for a description of the CTraceFile class. // // NOTES // * I make frequent use of the following construct: // return (error = <error_t>); // This causes the error flag of this instance of CTraceFile to be set // to the appropriate error, and then returns this value. // although CPeakList contains a lot that's irrelevant to CTraceFile. // * WriteSCF should scale the traces for writing to [0,255] range without // affecting the trace permanently. // ========================================|=================================== #include "CTraceFile.H" #include <iostream.h> #include <fstream.h> #include <string.h> #include <stdio.h> #include <ctype.h> #include <stdlib.h> #include <time.h> #include "CPeakList.H" #include "RInclude.H" #include "RInlines.H" #include "DNA.H" vrsn CTraceFile::version = "94.05.03"; const bool DEBUG = !TRUE; // ============================================================================ // CTraceFile (constructor) // - - - - - - - - - - - - - - - - - - - - - - - - - - // This constructor takes up to two arguments which specify filename and format // for initialization. If neither is provided, the instance is initialized // with empty values. If a filename is provided, then an attempt is made to // read the file in the format specified by fmt, or in a format inferred by // Read if fmt is not specified. ie. The following are valid: // CTraceFile myTrace(); // empty instance // CTraceFile myTrace("myFile"); // attempt read myFile, guessing format // CTraceFile myTrace("myFile",SCF); // attempt to read myFile in SCF fmt // ========================================|=================================== CTraceFile::CTraceFile(const char* fn, format_t fmt): filename(NULL), error(noError), nativeFormat(unknown), whichStrand(top), numPoints(0), min(0), max(0), mean(0), leftCutoff(0), rightCutoff(0), numBases(0), sequence(NULL), basePositions(NULL), comments(NULL) { for (uint traceIndex=0; traceIndex<NUM_BASES; traceIndex++) trace[traceIndex]=NULL; if (fn) error = Read(fn,fmt); } // ============================================================================ // ~CTraceFile (destructor) // - - - - - - - - - - - - - - - - - - - - - - - - - - // Call Deallocate to clean up our dynamically-allocated space. // Note that the lifetime of peaks (a CSequence) is the same as the lifetime // of CTraceFile because it is automatically allocated when an instance of // CTraceFile is allocated. Therefore, it will be disposed of automatically // its destructor will free all space allocated for the peak records. // ========================================|=================================== CTraceFile::~CTraceFile(void) { Deallocate(); } // ============================================================================ // Allocate // - - - - - - - - - - - - - - - - - - - - - - - - - - // Allocate space for the traces, bases, base_positions, and comment. This is // all-or-none: on success, we've allocated all (and returned noError); on // failure we've allocated nothing (and returned memError); // ========================================|=================================== CTraceFile::error_t CTraceFile::Allocate(size_t pt_sz, size_t base_sz, size_t comment_sz) { // Allocate space for trace[] for (uint traceIndex=0; traceIndex<NUM_BASES; traceIndex++) if ( NULL == (trace[traceIndex]=new CTrace<trace_t>(pt_sz)) ) // allocate failed { Deallocate(); return (error = memError); } // Allocate space for sequence and base_positions if ( !(sequence = new char[base_sz]) || !(basePositions = new tracepos[base_sz]) || !(comments = new char[comment_sz+1]) ) { Deallocate(); return (error = memError); } numPoints = pt_sz; numBases = base_sz; rightCutoff = pt_sz-1; return (error = noError); } // ============================================================================ // Deallocate // - - - - - - - - - - - - - - - - - - - - - - - - - - // Make Mom proud and put away all of our toys... ensure that all of our space // is neatly reclaimed. This method is provided separately so that in the case // of errors, we can deallocate stuff in one call. Note that this doesn't // deallocate peaks; the reasoning is twofold: // 1) if we're ready to calculate the peaks, then we've already got our data // allocated in its entirety. Thus, there's no reason to call this except // to cause problems. You wouldn't want to do that. // 2) It's probably not kosher to deallocate peaks... it's been allocated // automatically and doing so might wreak havoc. // ========================================|=================================== void CTraceFile::Deallocate(void) { delete filename; for (uint traceIndex=0; traceIndex<NUM_BASES; traceIndex++) delete trace[traceIndex]; delete sequence; delete basePositions; delete comments; } // ============================================================================ // CalculateStats // - - - - - - - - - - - - - - - - - - - - - - - - - - // Calculates the stats for individual traces and then calculates the overall // stats for the combination of the four. Theoretically, sum may overflow, // although it's unlikely... if we have that many values, something's amiss. // ========================================|=================================== void CTraceFile::CalculateStats(void) { double sum = 0; for (uint traceIndex=A; traceIndex<NUM_BASES; traceIndex++) { trace[traceIndex]->CalculateStats(); if ((traceIndex == A) || // this is first pass || (trace[traceIndex]->Max() > max)) // Max() for this trace > max max = trace[traceIndex]->Max(); // set the new max if ((traceIndex == A) || // ditto for min values (trace[traceIndex]->Min() < min)) min = trace[traceIndex]->Min(); sum += trace[traceIndex]->Mean(); // cumulative value for means } mean = (trace_t)sum/NUM_BASES; // mean over 4 traces } // ============================================================================ // Transform // - - - - - - - - - - - - - - - - - - - - - - - - - - // Applies a 4x4 transformation/orthogonalization matrix to the 4 traces, // producing a new set of traces which replaces the existing set. // Transformation matrics are expected to be 4x4 matrix of the form: // M = [,A] [,C] [,G] [,T] // [A,] mAA mAC mAG mAT // [C,] mCA mCC mCG mCT // [G,] mGA mGC mGG mGT // [T,] mTA mTC mTG mTT // The general equation for the resulting traces is: // R = M O <==> R(T,i) = sum [ mTS x O(S,i) ] // S in {ACGT} // where R is the resulting vector of 4 traces // O is the original vector of 4 traces // M is the 4x4 ({ACGT}x{ACGT}) matrix whose elements m(i,j) are // the cross-term contributions of channel j to channel i // S & T are trace identifiers (Source & Target) in {A,C,G,T} // i loops over the indices of the trace // // ! The existing traces are discarded and replaced by the transformed data. // ========================================|=================================== CTraceFile::error_t CTraceFile::Transform(xform_mtx& matrix) { CTrace<trace_t>* results[NUM_BASES]; // initialize the results array for (uint traceIndex=A; traceIndex<NUM_BASES; traceIndex++) if ( NULL == (results[traceIndex]=new CTrace<trace_t>(numPoints)) ) // allocate failed... deallocate any already new'd { do delete results[--traceIndex]; while(traceIndex); return memError; } // do the transformations uint targetTrace; for ( targetTrace = A;targetTrace<NUM_BASES;targetTrace++) // targetTrace is the index of the resulting trace for (tracepos index=0; index<numPoints; index++) // index is the sample point being transformed { double sum = 0; for (uint sourceTrace=A; sourceTrace<NUM_BASES; sourceTrace++) // sourceTrace is the index of traces contributing to the result sum += matrix[targetTrace][sourceTrace] * (*trace[sourceTrace])[index]; (*results[targetTrace])[index] = (trace_t)sum; } // replace the original traces with the transformed ones. for (targetTrace = A; targetTrace<NUM_BASES;targetTrace++) { delete trace[targetTrace]; trace[targetTrace] = results[targetTrace]; trace[targetTrace]->LeftCutoff(leftCutoff); trace[targetTrace]->RightCutoff(rightCutoff); } // Calculate the stats for each trace; if mean is <0, then scale by -1 // orthogonalization will not be affected // for (targetTrace = A; targetTrace<NUM_BASES;targetTrace++) // { // trace[targetTrace]->CalculateStats(); // if (trace[targetTrace]->Mean()<0) // trace[targetTrace]->Scale(-1); // } CalculateStats(); return noError; } // ============================================================================ // PickPeaks // - - - - - - - - - - - - - - - - - - - - - - - - - - // Calls CTrace<>::PickPeaks for each trace and assimilates the results into a // single list in order of occurrence (a merge sort of the peaks). // ! Because this is still being designed, error handling is lax, although // the skeleton for it exists. // ========================================|=================================== CTraceFile::error_t CTraceFile::PickPeaks(double PeakMeanCoefficient, double ZeroThreshold) { uint traceIndex=0; // // Calculates stats to ensure that each trace has correct min, max, & // mean values. // CalculateStats(); // // Generate a list of peaks for each trace // for (traceIndex=A; traceIndex<NUM_BASES; traceIndex++) { trace_t MinPeakHeight = (trace_t)(PeakMeanCoefficient * trace[traceIndex]->Mean()); if (trace[traceIndex]->PickPeaks(MinPeakHeight,ZeroThreshold)==noError) trace[traceIndex]->Peaks().Analyze(*trace[traceIndex]); else return peakPickError; } return AssimilatePeaks(); } // ============================================================================ // AssimilatePeaks // - - - - - - - - - - - - - - - - - - - - - - - - - - // Assumes that each trace in the tracefile has a list of peaks and collates // them into our own list for the tracefile. // ========================================|=================================== CTraceFile::error_t CTraceFile::AssimilatePeaks() { uint traceIndex=0; // // Assimilate the peaks into one CSequence of PeakRec's // by popping the 'least' peak from the front of the appropriate // list. When we've done sum total pops, we're done. (This is easier // and faster than checking if all lists are empty at every iteration.) // ulong sum=0; uint next_least[NUM_BASES]={0}; // next least elem in each trace for (traceIndex=A; traceIndex<NUM_BASES; traceIndex++) sum += trace[traceIndex]->Peaks().Size(); while (sum) { uint least_trace = A; tracepos least_pos = numPoints; // just want something that's // guaranteed to be larger than // our first real hit // Search for the trace which has the least element (ie. the trace // with a peak at an index less than that for peaks of any other trace) for (traceIndex=A; traceIndex<NUM_BASES; traceIndex++) { CPeakList& ind_peaks = trace[traceIndex]->Peaks(); if ( (next_least[traceIndex] < ind_peaks.Size()) && (ind_peaks[next_least[traceIndex]].center < least_pos) ) { least_trace = traceIndex; least_pos = ind_peaks[next_least[traceIndex]].center; } } // So now the least peak has been identified. // We'll fill in the whichTrace field and append it to the assimilated // list of peaks PeakRec pr = trace[least_trace]->Peaks()[next_least[least_trace]]; pr.whichTrace = (base_t)least_trace; peaks.Append(pr); // Set the next_least index to the successor of this victim and // decrement the sum counter... 1 down and sum [sic] more to go. :-| next_least[least_trace]++; sum--; } return noError; } // ============================================================================ // PrunePeaks // - - - - - - - - - - - - - - - - - - - - - - - - - - // Simply step through the sequence of peaks and in the case where adjacent // peaks are less than minSeparation apart, remove the peak with the lesser // probability. // A list of the pairwise peak contentions and their resolution will be output // if an output stream is provided. // // ALGORITHM: // Given a set of peak records indexed 0..s with fields: // center - position in trace of the center of the peak // PH - probability of the peak // and a minimum separation, minSep, prune the list by removing the less // probable peak when adjacent peak centers are within minSep. // let peakA be the index of the first peak in the comparison // let peakA=0 // while peakA<=s-2 do // let peakB = peakA + 1 // if peakB.center - peakA.center <= minSep then // if peakA.PH < peakB.PH then // remove peakA // else // remove peakB // fi // else // increment peakA // fi // elihw // // ! This is intended to be only a first cut at a pruning algorithm. It has // many problems, including the possibility that a series of peaks all // separated by less than minSeparation between adjacent members will // result in only 1 of the members being selected, when perhaps more than // one peak should result from the collection. // ========================================|=================================== CTraceFile::error_t CTraceFile::PrunePeaks( tracepos minSeparation, ostream* os) { ulong peakNo=0; while(peakNo<=peaks.Size()-2) if (peaks[peakNo+1].center-peaks[peakNo].center < minSeparation) { // victim is the peak which is less probable [ie. lesser P(H|D)] ulong victim = (peaks[peakNo+1].PHD < peaks[peakNo].PHD ? peakNo+1: peakNo); base_t victimTrace = peaks[victim].whichTrace; if (os) { *os << setfill('-') << setw(80) << "" << setfill(' ') << endl; *os << Tag(victim==peakNo) << peaks[peakNo]; *os << Tag(victim==peakNo+1) << peaks[peakNo+1]; } // remove the peak record from the trace's peak list (*trace[victimTrace]).Peaks().RemovePeak(peaks[victim].center); // and remove the peak from the tracefile's peak list peaks.Remove(victim); } else peakNo++; // end while if (os) *os << setfill('-') << setw(80) << "" << setfill(' ') << endl; return noError; } ostream& operator<<(ostream& os, CTraceFile& ctf) { tracepos length=ctf.rightCutoff - ctf.leftCutoff + 1; os << "filename:\t" << ctf.filename << endl << "native format:\t" << FileFormatAbbr(ctf.nativeFormat) << " [" << FileFormatDesc(ctf.nativeFormat) << "]" << endl << "strand:\t\t" << (short)ctf.whichStrand << endl << "total size:\t" << ctf.numPoints << endl << "window:\t\t" << "[" << ctf.leftCutoff << "," << ctf.rightCutoff << "]" << " (" << length << " point" << Plural(length) << ")" << endl << "min/max/mean:\t" << ctf.min << "/" << ctf.max << "/" << ctf.mean << " (within window)" << endl << "sequence len:\t"<< ctf.numBases << endl << "comments" << endl << "--------" << endl << ctf.comments << endl << "peaks picked:\t"<< ctf.peaks.Size() << endl; if (ctf.peaks.Size() != 0) { os << setw(3) << "#" << " " << PeakRecHeader; for(uint i = 0; i<ctf.peaks.Size(); i++) os << setw(3) << i+1 << " " << ctf.peaks[i]; } return os; } // ============================================================================ // ### # ####### // # # # # // # # # # // # # # # // # # # # // # # # # // ### # ####### // // General I/O Methods // - - - - - - - - - - - - - - - - - - - - - - - - - - // public: // Read(const char*, format) - Reads from named file in specified format, or // inferred format if not specified. // Write(const char*, format) - Writes to named file in specified format, or // native format if not specified. // // private: // Read(FILE*, format) - Reads from FILE* (opened with at least "rb" // permissions in specified format (mandatory). This just dispatches to // the appropriate file format reader (ie. ReadSCF). // Write(FILE*, format) - Writes to FILE* (opened with at least "wb" // permissions) in specified format (mandatory). This just dispatches to // the appropriate file format writer (ie. WriteSCF). // // NOTES // * File I/O must be handled through the const char* reader and writer. // All other methods are declared private. // * The private methods do not set the class's error flag; this allows the // calling function (ie. the public methods) to handle the error as // appropriate, or to set the error flag if desired. (ie. we may wish to // retry with a different format. // ? Currently, if an error occurs during reading, it's possible to return // (with error) an incompletely-read trace. I think that this is not // desired, but I'm still weighing the merits. // ========================================|=================================== // ============================================================================ // Read(const char*, format_t) // - - - - - - - - - - - - - - - - - - - - - - - - - - // Takes a filename and optional format. If the format is not specified, I // attempt to determine it by FileFormat (see FileFormat.H). If it still can't // be determined, we're sunk; if it can be determined, we attempt to read it. // // NOTES // ? should the trace be complete deallocated if it fails? // ========================================|=================================== CTraceFile::error_t CTraceFile::Read(const char* fn, format_t fmt) { error_t err; FILE* fp; if (unknown == fmt) // user didn't tell us format... try { // to determine it for ourselves fmt = FileFormat(fn); if (unknown == fmt) // drat! still unknown return (error = unkFmtError); } if (fp = fopen(fn,"rb")) { // file opened successfully err = Read(fp,fmt); fclose(fp); } else { err = fileDoesntExistError; } if (!err) { filename = new char[strlen(fn)]; strcpy(filename,fn); nativeFormat = fmt; CalculateStats(); } return (error = err); } // ============================================================================ // Write(const char*, format_t) // - - - - - - - - - - - - - - - - - - - - - - - - - - // Takes a filename and optional format. If the format is not specified, the // native format is assumed. If we're writing in format other than the one // used to read the tracefile, the comment field is duplicated and a temporary // version is constructed which annotates the conversion process. // // NOTES // ! I don't check to see if the file exists. I should do this... // ========================================|=================================== CTraceFile::error_t CTraceFile::Write(const char* fn, format_t fmt) { error_t err; FILE* fp; char* tempComment=NULL; // temporary comment char* origComment=comments; // saved pointer to original comment if (DEBUG) cerr << "Write(char*,format_t): Entering" << endl; if ( 0 == numPoints ) return (error = emptyError); if (fmt == unknown) // user didn't tell us format... fmt = nativeFormat; // assume nativeFormat else if (fmt != nativeFormat) // user specified a format different from native... make a comment // that this was translated by CTraceFile. { char buf[1000] = {0}; strcat(buf,"conv = by CTraceFile "); strcat(buf,version); strcat(buf," from "); strcat(buf,FileFormatDesc(nativeFormat)); strcat(buf," ("); strcat(buf,FileFormatAbbr(nativeFormat)); strcat(buf,")"); strcat(buf," to "); strcat(buf,FileFormatDesc(fmt)); strcat(buf," ("); strcat(buf,FileFormatAbbr(fmt)); strcat(buf,")"); strcat(buf," on "); time_t tm = time(NULL); // char* tmsp = ctime(&tm); // strcat(buf,tmsp); struct tm* tp=localtime(&tm); strftime(buf+strlen(buf),1000-strlen(buf),"%Y-%b-%d %H:%M%Z\n",tp); // This is the original conversion comment; I've opted for a style // that more closely matches makeSCF's for consistency. // strcat(buf,tmsp); // strcpy(buf+strlen(buf)-1,"- tracefile converted from "); // strcat(buf,FileFormatDesc(nativeFormat)); // strcat(buf," to "); // strcat(buf,FileFormatDesc(fmt)); // strcat(buf," by CTraceFile.\n"); // make space for the temporary comment tempComment = new char[strlen(comments)+strlen(buf)+1]; if (tempComment == NULL) return memError; strcpy(tempComment,comments); // copy old comment to new comment strcat(tempComment,buf); // add the conversion comment comments = tempComment; // we'll write, then swap them back } if (DEBUG) cerr << "Write(char*,format_t): Writing" << endl; if ((fp = fopen(fn,"wb")) != NULL) { // file opened successfully if ((err = Write(fp,fmt)) != noError) { fclose(fp); remove(fn); // half-complete files are useless } else fclose(fp); // write succeeded } else err = ioError; if (DEBUG) cerr << "Write(char*,format_t): Exiting" << endl; if (tempComment != NULL) // this write involved a conversion { // we need to restore the original comments=origComment; // comment and kill the space for delete tempComment; // the temporary } return (error = err); } // ============================================================================ // Read(FILE*, format_t) [private] // - - - - - - - - - - - - - - - - - - - - - - - - - - // This is the dispatcher to the appropriate format reader. It is called by // Read(const char*, format_t), but not called directly. // ========================================|=================================== CTraceFile::error_t CTraceFile::Read(FILE* fp, format_t fmt) { if (DEBUG) cerr << "Read(FILE*,format_t): Entering; fmt = "<< (short)fmt<< endl; if (NULL == fp) return ioError; // else... error_t err = noError; switch (fmt) { case unknown: err = unkFmtError; break; case SCF: err = ReadSCF(fp); break; case ABI0: err = ReadABI(fp,0); break; case ABI1: err = ReadABI(fp,1); break; case ABI: err = ReadABI(fp,2); break; case ALF: err = ReadALF(fp); break; default: err = fmtNotSuppError; } if (DEBUG) cerr << "Read(FILE*,format_t): Exiting" << endl; return err; } // ============================================================================ // Write(FILE*, format_t) [private] // - - - - - - - - - - - - - - - - - - - - - - - - - - // This is the dispatcher to the appropriate format writer. It will most // likely be called by Write(const char*, format_t), but not called directly. // ========================================|=================================== CTraceFile::error_t CTraceFile::Write(FILE* fp, format_t fmt) { error_t err = noError; if (NULL == fp) return ioError; // else... if (DEBUG) cerr << "Write(FILE*,format_t): Entering" << endl; switch (fmt) { case unknown: err = unkFmtError; break; case SCF: err = WriteSCF(fp); break; case ABI: err = WriteABI(fp); break; case ALF: err = WriteALF(fp); break; default: err = fmtNotSuppError; } if (DEBUG) cerr << "Write(FILE*,format_t): Exiting" << endl; return err; } // ============================================================================ // ##### ##### ####### ### # ####### // # # # # # # # # # // # # # # # # # // ##### # ##### # # # # // # # # # # # # // # # # # # # # # # // ##### ##### # ### # ####### // // SCF (Standard Chromatogram Format) I/O Methods // - - - - - - - - - - - - - - - - - - - - - - - - - - // File format inferred from ted source (see CTraceFile.H). // // SCF_File = // offset description // --------------- -------------------------------- // 0 <SCF_Header> // trace_offset <SCF_Sample> x num_samples // sequence_offset <SCF_Bases> x num_bases // comment_offset comment of length comment_length // ========================================|=================================== const long SCF_MAGIC_COOKIE = ((('.'<<8)+'s'<<8)+'c'<<8)+'f'; struct SCF_Header { long magic_cookie; // = SCF_MAGIC_COOKIE long num_points; // number of trace points (per base) long trace_offset; // position of trace data in file long num_bases; // number of bases in sequence long left_cutoff; // unreadable < left cutoff long right_cutoff; // unreadable > right cutoff long sequence_offset; // position of sequence data in file long comment_length; // strlen of comment long comment_offset; // position of comment in file long spare[23]; friend size_t fread(FILE* fp, SCF_Header& header) { return ::fread(&header,sizeof(SCF_Header),1,fp); } friend size_t fwrite(FILE* fp, const SCF_Header& header) { return ::fwrite(&header,sizeof(SCF_Header),1,fp); } }; struct SCF_Sample // note that the values for each { // trace are interleaved. ie byte sample_A; // A1-C1-G1-T1-A2-C2-G2-...-Gn-Tn byte sample_C; // where Ni = value of point i of byte sample_G; // trace N. byte sample_T; friend size_t fread(FILE* fp, SCF_Sample& sample) { return ::fread(&sample,sizeof(SCF_Sample),1,fp); } friend size_t fwrite(FILE* fp, const SCF_Sample& sample) { return ::fwrite(&sample,sizeof(SCF_Sample),1,fp); } }; struct SCF_Base { long trace_index; // location of base as called byte prob_A; // probability that base = A byte prob_C; // C byte prob_G; // G byte prob_T; // T char base; // base that was chosen byte _unused1; // unused? byte _unused2; // unused? byte _unused3; // unused? friend size_t fread(FILE* fp, SCF_Base& base) { return ::fread(&base,sizeof(SCF_Base),1,fp); } friend size_t fwrite(FILE* fp, const SCF_Base& base) { return ::fwrite(&base,sizeof(SCF_Base),1,fp); } }; // ============================================================================ // ReadSCF [private] // - - - - - - - - - - - - - - - - - - - - - - - - - - // Read a SCF file; called exclusively by Read(FILE*,format_t). // ========================================|=================================== CTraceFile::error_t CTraceFile::ReadSCF(FILE* fp) { uint position; if (ferror(fp)) return ioError; // // Read header // SCF_Header header; rewind(fp); // take it from the top... fread(fp,header); // Read the header if (ferror(fp)) // check for error return ioError; nativeFormat = SCF; // set our instance's data members whichStrand = top; // // Allocate space for traces, sequence, etc. // this is all-or-none: we either want all our required space or none at // all. error_t err = Allocate((size_t)header.num_points,\ (size_t)header.num_bases,\ (size_t)header.comment_length); if (noError != err) return err; // // read the traces // fseek(fp,header.trace_offset,SEEK_SET); for (position=0; position<numPoints; position++) { SCF_Sample sample; fread(fp,sample); (*trace[A])[position]=sample.sample_A; (*trace[C])[position]=sample.sample_C; (*trace[G])[position]=sample.sample_G; (*trace[T])[position]=sample.sample_T; } if (ferror(fp)) return ioError; // // read the sequence & base positions // fseek(fp,header.sequence_offset,SEEK_SET); for (uint traceIndex=0; traceIndex<numBases; traceIndex++) { SCF_Base base; fread(fp,base); sequence[traceIndex]=base.base; basePositions[traceIndex]=(tracepos)base.trace_index; } if (ferror(fp)) return ioError; // // read the comment // fseek(fp,header.comment_offset,SEEK_SET); fread(comments,(size_t)header.comment_length,1,fp); if (ferror(fp)) return ioError; return noError; } // ============================================================================ // WriteSCF [private] // - - - - - - - - - - - - - - - - - - - - - - - - - - // Write a SCF file; called exclusively by Write(FILE*,format_t). // ========================================|=================================== CTraceFile::error_t CTraceFile::WriteSCF(FILE* fp) { SCF_Header header; uint position; if (DEBUG) cerr << "WriteSCF: Entering" << endl; // // prepare the header and calculate file offsets for trace information // header.magic_cookie = SCF_MAGIC_COOKIE; header.num_points = numPoints; header.num_bases = numBases; header.left_cutoff = leftCutoff; header.right_cutoff = rightCutoff; header.comment_length = strlen(comments); header.trace_offset = sizeof(SCF_Header); header.sequence_offset = header.trace_offset+sizeof(SCF_Sample)*numPoints; header.comment_offset = header.sequence_offset+sizeof(SCF_Base)*numBases; if (DEBUG) cerr << "WriteSCF: header" << endl; // // write the header // rewind(fp); // take it from the top... fwrite(fp, header); // write the header if (ferror(fp)) // check for error return ioError; if (DEBUG) cerr << "WriteSCF: traces" << endl; // // write the traces // fseek(fp,header.trace_offset,SEEK_SET); // move to traces for (position=0; position<numPoints; position++) { SCF_Sample sample; sample.sample_A=(byte)(*trace[A])[position]; sample.sample_C=(byte)(*trace[C])[position]; sample.sample_G=(byte)(*trace[G])[position]; sample.sample_T=(byte)(*trace[T])[position]; fwrite(fp,sample); } if (ferror(fp)) return ioError; if (DEBUG) cerr << "WriteSCF: sequence" << endl; // // write the sequence & base positions // fseek(fp,header.sequence_offset,SEEK_SET); // move to traces for (position=0; position<numBases; position++) { SCF_Base base; base.base=sequence[position]; base.trace_index=basePositions[position]; fwrite(fp,base); } if (ferror(fp)) return ioError; if (DEBUG) cerr << "WriteSCF: comment" << endl; // // write the comment // fseek(fp,header.comment_offset,SEEK_SET); fwrite(comments,(size_t)header.comment_length,1,fp); if (ferror(fp)) return ioError; if (DEBUG) cerr << "WriteSCF: Exiting" << endl; return noError; } // ============================================================================ // # ###### ### ### # ####### // # # # # # # # # # // # # # # # # # # # // # # ###### # # # # # // ####### # # # # # # # // # # # # # # # # # // # # ###### ### ### # ####### // // ABI format I/O methods // - - - - - - - - - - - - - - - - - - - - - - - - - - // These routines are copied almost verbatim from seqIOABI.[hc] written by // LaDeana Hillier, although I've made extensive modifications to certain // sections. // ========================================|=================================== typedef ushort int2; // Two byte short assumption typedef ulong int4; // Four byte long assumption const long DATAtag = ((long) ((((('D'<<8)+'A')<<8)+'T')<<8)+'A'); const long BASEtag = ((long) ((((('P'<<8)+'B')<<8)+'A')<<8)+'S'); const long BASEPOStag = ((long) ((((('P'<<8)+'L')<<8)+'O')<<8)+'C'); const long SPACINGtag = ((long) ((((('S'<<8)+'P')<<8)+'A')<<8)+'C'); const long SIGNALtag = ((long) ((((('S'<<8)+'/')<<8)+'N')<<8)+'%'); const long FWO_tag = ((long) ((((('F'<<8)+'W')<<8)+'O')<<8)+'_'); const long MCHNtag = ((long) ((((('M'<<8)+'C')<<8)+'H')<<8)+'N'); const long PDMFtag = ((long) ((((('P'<<8)+'D')<<8)+'M')<<8)+'F'); const long SMPLtag = ((long) ((((('S'<<8)+'M')<<8)+'P')<<8)+'L'); const long PPOStag = ((long) ((((('P'<<8)+'P')<<8)+'O')<<8)+'S'); const long PRIMERtag = ((long) ((((('P'<<8)+'P')<<8)+'O')<<8)+'S'); const size_t INDEXPO = 26; // offset of offset to index const int INDEX_ENTRY_LENGTH= 28; inline short baseIndex(char B) { return ((B)=='C'?0:(B)=='A'?1:(B)=='G'?2:3); } // ============================================================================ // readABIInt2 // ========================================|=================================== static bool readABIInt2( FILE *fp, int2 *i2) { unsigned char buf[sizeof(int2)]; if (fread(buf, sizeof(buf), 1, fp) != 1) return (FALSE); *i2 = (int2) (((unsigned short)buf[1]) + ((unsigned short)buf[0]<<8)); return (TRUE); } // ============================================================================ // readABIInt4 // ========================================|=================================== static bool readABIInt4( FILE *fp, int4 *i4) { unsigned char buf[sizeof(int4)]; if (fread(buf, sizeof(buf), 1, fp) != 1) return (FALSE); *i4 = (int4) (((unsigned long)buf[3]) + ((unsigned long)buf[2]<<8) + ((unsigned long)buf[1]<<16) + ((unsigned long)buf[0]<<24)); return (TRUE); } // ============================================================================ // getIndexEntryLW // - - - - - - - - - - - - - - - - - - - - - - - - - - // From the ABI results file connected to `fp' whose index starts at byte // offset `indexO', return in `val' the `lw'th long word from the `count'th // entry labeled `label'. The result indicates success. // ========================================|=================================== bool getIndexEntryLW ( FILE* fp, long indexO, long label, long count, int lw, int4* val) { int entryNum=-1; int i; int4 entryLabel, entryLw1; // if (DEBUG) //cerr<<"getIndexEntryLW(label="<<label<<",count="<<count<<",lw="<<lw<<")"<< endl; do { entryNum++; if (fseek(fp, indexO+(entryNum*INDEX_ENTRY_LENGTH), SEEK_SET) != 0) return FALSE; if (!readABIInt4(fp, &entryLabel)) return FALSE; if (!readABIInt4(fp, &entryLw1)) return FALSE; } while (!(entryLabel == label && entryLw1 == count)); for (i=2; i<=lw; i++) if (!readABIInt4(fp, val)) return FALSE; return TRUE; } // ============================================================================ // getIndexEntryW // - - - - - - - - - - - - - - - - - - - - - - - - - - // From the ALF results file connected to `fp' whose index starts at byte // offset `indexO', return in `val' the `lw'th word (int2) from the entry // labeled `label'. The result indicates success. // ========================================|=================================== bool getIndexEntryW ( FILE* fp, long indexO, long label, int lw, int2* val) { int entryNum=-1; int i; int4 entryLabel; int4 jval; do { entryNum++; if (fseek(fp, indexO+(entryNum*INDEX_ENTRY_LENGTH), SEEK_SET) != 0) return FALSE; if (!readABIInt4(fp, &entryLabel)) return FALSE; } while (entryLabel != label); for (i=2; i<lw; i++) if (!readABIInt4(fp, &jval)) return FALSE; if (!readABIInt2(fp, val)) return FALSE; return TRUE; } // ============================================================================ // ReadABI [private] // - - - - - - - - - - - - - - - - - - - - - - - - - - // Read a ABI file; called exclusively by Read(FILE*,format_t). // Read the ABI format sequence with name `fn' into `seq'. // // This code is a rehash of seqIOABI.c, significantly modified for use here. // bottom strand designations are completely ignored here. // ========================================|=================================== CTraceFile::error_t CTraceFile::ReadABI(FILE* fp, short whichSet) { if (DEBUG) cerr << "ReadABI: Entering; set = " << whichSet << endl; const size_t COMMENT_LEN = 2048; // max comment length; const short TRACE_INDEX = NUM_BASES*whichSet+1; // index to offset of 1st trace in set error_t err = noError; char line[128]; int4 ppos; bool bottom = FALSE; char* enzyme = NULL; uint whichTrace; ulong numPoints; ulong numBases; // int2 primerPos; int2 traceValue; int4 fwo_; // base -> lane mapping int4 signalO; int4 indexO; // index int4 baseO; // bases int4 basePosO; // base positions int4 dataCO; // C trace int4 dataAO; // A trace int4 dataGO; // G trace int4 dataTO; // T trace int4 MCHN_O; // machine name int4 PDMF_O; // dye primer guff int4 SMPL_O; // sample name int i; if (DEBUG) cerr << "ReadABI: 1.0" << endl; // get the index offset if ((fseek(fp, INDEXPO, SEEK_SET) != 0) || (!readABIInt4(fp, &indexO))) return ioError; if (DEBUG) cerr << "ReadABI: 1.1" << endl; // get the number of points if (!getIndexEntryLW(fp,indexO,DATAtag,TRACE_INDEX,3,&numPoints)) return ioError; // get the number of bases if (DEBUG) cerr << "ReadABI: 1.2" << endl; if (!getIndexEntryLW(fp,indexO,BASEtag,1,3,&numBases)) return ioError; // not needed; in fact it's apparently superfluous and incorrect when // compared with similar code to do the same thing, below. -- Reece #if (FALSE) // get Primer Position...where the ABI file found the start of if (DEBUG) cerr << "ReadABI: 1.3" << endl; if (!getIndexEntryW(fp,indexO,PPOStag,6,&primerPos)) return ioError; #endif if (DEBUG) cerr << "ReadABI: 1.4" << endl; // get bases offset if (!getIndexEntryLW(fp,indexO,BASEtag,1,5,&baseO)) return ioError; if (DEBUG) cerr << "ReadABI: 1.5" << endl; // get base positions offset if (!getIndexEntryLW(fp,indexO,BASEPOStag,1,5,&basePosO)) return ioError; if (DEBUG) cerr << "ReadABI: 1.6" << endl; // The order of the DATA fields is determined by the field FWO_ // Juggle around with data pointers to get it right int4* dataxO[NUM_BASES]; dataxO[0] = &dataCO; dataxO[1] = &dataAO; dataxO[2] = &dataGO; dataxO[3] = &dataTO; // Get the Freak World Out (FWO?) field ... if (!getIndexEntryLW(fp,indexO,FWO_tag,1,5,&fwo_)) return ioError; //Get the positions of the four traces if (!( getIndexEntryLW(fp,indexO,DATAtag,TRACE_INDEX, 5,dataxO[baseIndex((char)(fwo_>>24&BYTE[0]))]) && getIndexEntryLW(fp,indexO,DATAtag,TRACE_INDEX+1, 5,dataxO[baseIndex((char)(fwo_>>16&BYTE[0]))]) && getIndexEntryLW(fp,indexO,DATAtag,TRACE_INDEX+2, 5,dataxO[baseIndex((char)(fwo_>>8&BYTE[0]))]) && getIndexEntryLW(fp,indexO,DATAtag,TRACE_INDEX+3, 5,dataxO[baseIndex((char)(fwo_&BYTE[0]))]) )) return ioError; // // Offsets for critical data have been obtained. Offsets for non-critical // info will be determined later, mostly for comment field. // Now we're reade to get the critical information. // if (DEBUG) cerr << "ReadABI: 2-allocate" << endl; // // Allocate space for data, bases, and comments // if (err = Allocate((size_t)numPoints,(size_t)numBases,COMMENT_LEN)) return err; // // Read in the traces using the offsets above // int4 dataOffset[NUM_BASES]; dataOffset[A]=dataAO; dataOffset[C]=dataCO; dataOffset[G]=dataGO; dataOffset[T]=dataTO; for (whichTrace=A;whichTrace<=T;whichTrace++) { if (DEBUG) cerr << "ReadABI: 2" << DNA_BASES[whichTrace] << "; offset = " << dataOffset[whichTrace] << endl; if (fseek(fp, dataOffset[whichTrace], SEEK_SET)) return ioError; for (i=0;i<numPoints;i++) if (readABIInt2(fp, &traceValue)) (*trace[whichTrace])[i] = traceValue; else return ioError; } // // read in the sequence // if (DEBUG) cerr << "ReadABI: 3" << endl; if (fseek(fp, baseO, SEEK_SET)) return ioError; for (i=0;i<numBases;i++) { int ch; if ((ch = fgetc(fp)) == EOF) return ioError; sequence[i] = (ch == 'N') ? '-' : ch; } // // read base positions // if (DEBUG) cerr << "ReadABI: 4" << endl; for (i=0;i<numBases;i++) if (!readABIInt2(fp, (int2*)&basePositions[i])) return ioError; // // Gather useful information // such as signal strength, machine name, etc. We don't have the file // offsets for these yet, so we must get them for each item. Because // they're optional, errors are ignored after this point. // if (DEBUG) cerr << "ReadABI: 5" << endl; // Get Signal Strength Offset if (getIndexEntryLW(fp,indexO,SIGNALtag,1,5,&signalO)) { int2 C,A,G,T; int2* base[4]; base[0] = &C; base[1] = &A; base[2] = &G; base[3] = &T; if (fseek(fp, signalO, SEEK_SET) >= 0 && readABIInt2(fp, base[baseIndex((char)(fwo_>>24&255))]) && readABIInt2(fp, base[baseIndex((char)(fwo_>>16&255))]) && readABIInt2(fp, base[baseIndex((char)(fwo_>>8&255))]) && readABIInt2(fp, base[baseIndex((char)(fwo_&255))])) { sprintf(line,"avg_signal_strength = C:%d A:%d G:%d T:%d\n",C,A,G,T); strcat(comments,line); } } // signal strength // the following returns absurd results (ie. on order of 10E10) #if (FALSE) // Get the spacing.. it's a float but don't worry yet int4 spacing; if (getIndexEntryLW(fp,indexO,SPACINGtag,1,5,&spacing)) { sprintf(line,"avg_spacing = %lf\n",spacing); strcat(comments,line); } #endif // Get primer position if (getIndexEntryLW(fp,indexO,PPOStag,1,5,&ppos)) { // ppos stored in MBShort of pointer primerPosition = ppos>>16; sprintf(line,"primer_position = %d\n",primerPosition); strcat(comments,line); } // machine name if ((getIndexEntryLW(fp,indexO,MCHNtag,1,5,&MCHN_O)) && (fseek(fp, MCHN_O, SEEK_SET) >= 0)) { unsigned char l; char buffer[256]; // Pascal-type string; len=buffer[0] fread(&l,sizeof(char),1,fp); fread(buffer,l,1,fp); sprintf(line,"machine_name = %.*s\n",l,buffer); strcat(comments,line); } // Dye Primer Offset if ((getIndexEntryLW(fp,indexO,PDMFtag,1,5,&PDMF_O)) && (fseek(fp, PDMF_O, SEEK_SET) >= 0)) { unsigned char l; char buffer[256]; // Pascal-type string; len=buffer[0] fread(&l,sizeof(char),1,fp); fread(buffer,l,1,fp); sprintf(line,"dye_primer = %.*s\n",l,buffer); strcat(comments,line); } // sample name if ((getIndexEntryLW(fp,indexO,SMPLtag,1,5,&SMPL_O)) && (fseek(fp, SMPL_O, SEEK_SET) >= 0)) { unsigned char l; char buffer[256]; // Pascal-type string; len=buffer[0] fread(&l,sizeof(char),1,fp); fread(buffer,l,1,fp); sprintf(line,"sample_name = %.*s\n",l,buffer); strcat(comments,line); } if (DEBUG) cerr << "ReadABI: Exiting normally" << endl; return noError; } // ============================================================================ // WriteABI [private] // - - - - - - - - - - - - - - - - - - - - - - - - - - // Write a ABI file; called exclusively by Write(FILE*,format_t). // ========================================|=================================== CTraceFile::error_t CTraceFile::WriteABI(FILE* fp) { fp = NULL; return fmtNotSuppError; } // ============================================================================ // # # ####### ### # ####### // # # # # # # # # // # # # # # # # # // # # # ##### # # # # // ####### # # # # # # // # # # # # # # # // # # ####### # ### # ####### // // ALF format I/O methods // - - - - - - - - - - - - - - - - - - - - - - - - - - // // ========================================|=================================== // ============================================================================ // ReadALF [private] // - - - - - - - - - - - - - - - - - - - - - - - - - - // Read a ALF file; called exclusively by Read(FILE*,format_t). // ========================================|=================================== CTraceFile::error_t CTraceFile::ReadALF(FILE* fp) { fp = NULL; return fmtNotSuppError; } // ============================================================================ // WriteALF [private] // - - - - - - - - - - - - - - - - - - - - - - - - - - // Write a ALF file; called exclusively by Write(FILE*,format_t). // ========================================|=================================== CTraceFile::error_t CTraceFile::WriteALF(FILE* fp) { fp = NULL; return fmtNotSuppError; }